Text analysis: title and abstract of male and female speakers
Abstracts
# Load the presentations CSV and derive a calendar year from the date column.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# NOTE(review): dmy()/year() look like lubridate helpers; the library() call
# is not visible in this section.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
data$date <- dmy(data$date) # dates are parsed as day-month-year
data$year <- year(data$date)
#skimr::skim(data)
Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.
# ids of the presentations to leave out of the analysis
IDs <- c(154, 250, 211, 289, 230, 167)
data <- filter(data, !(id %in% IDs))
Using abstracts in English (original or translated)
# keep only the rows that have an English abstract
data <- filter(data, !is.na(abstract_english))
Number of abstracts per group
table(data$gender)##
## F M
## 99 138
table(data$position_cat,data$gender)##
## F M
## others 4 1
## postdoc 21 21
## professor 21 60
## student 52 56
Tidytext
# Tokenise title + abstract into one word per row, keeping the presentation
# metadata alongside each token.
text_tok <- data %>%
  dplyr::select(id, gender, position_cat, audience_n,
                abstract_english, title_english) %>%
  mutate(text = paste(title_english, abstract_english)) %>%
  unnest_tokens(output = word, input = text)

stop_w <- tibble(word = stopwords("en"))

# remove stopwords
text <- text_tok %>%
  anti_join(stop_w, by = "word") %>%
  arrange(word)

# remove numbers and other characters
# Tokens starting with a digit or punctuation are dropped by pattern. The
# former slice(-c(1:290)) removed a fixed number of leading rows after
# sorting, which silently removes the wrong rows as soon as the data, the
# stop-word list or the sort collation changes.
# NOTE(review): confirm this drops the same tokens as the old 290-row slice.
text <- text %>%
  filter(!grepl("^[[:digit:][:punct:]]", word)) %>% # number and some symbols
  filter(nchar(word) != 1) %>% # letters alone
  # NOTE(review): the non-ASCII literals below look mis-encoded; they are kept
  # byte-for-byte -- check them against the encoding of the data file.
  filter(!word %in% c("mpas", "ÎŽ13c", "ÎČ")) # remove acronyms, symbols
# Collapse a hand-picked list of simple plurals / inflected forms to their
# base form by dropping the final "s".
plural <- c("actions","advances", "adaptations", "amphibians", "animals",
            "ants","anurans","abundances","adjustments","adults","affects",
            "applications","approaches", "bees","builds", "birds","palms",
            "cerrados","challenges", "outputs", "queens", "techniques",
            "continents","crops", "consequences", "questions",
            "decisions","declines","determines","determinants", "defenses",
            "dynamics",
            "economics", "ecosystems","environments", "experiences",
            "forests","grasslands",
            "genetics","gifts","gradients","guides","impacts",
            "increases","interactions","lives",
            "landscapes","males","mammals", "mangroves","models","movements",
            "mutualisms","networks","neotropics",
            "opilions","phenotypes","plants","projects","paths", "perspectives",
            "populations","promotes","relationships", "relations",
            "resources","responses","roads","services","skulls","snakes","seeds",
            "spaces", "spiders","stages", "trees", "variations",
            "threats")

# compute the match once and reuse it on both sides of the assignment
is_plural <- text$word %in% plural
singular <- substr(text$word[is_plural], 1, nchar(text$word[is_plural]) - 1)
# "approaches" loses "es", not just "s": dropping one letter produced the
# non-word "approache", which was counted separately from "approach"
singular[singular == "approache"] <- "approach"
text$word[is_plural] <- singular
- Grouping similar words:
lemma <- rbind(c("adaptive", "adaptation"),
c("abilities","ability"),
c("advancement", "advance"),
c("absent","absence"),
c("agricultural", "agriculture"),
c("agro", "agriculture" ),
c("amazonia","amazon" ),
c("amazonian","amazon" ),
c("andean","andes"),
c("apply","application"),
c("applying","application"),
c("apidae","apis"),
c("arachnida","arachnid"),
c("argue","argument"),
c("basal", "basis"),
c("behavioral","behavior"),
c("behavioural","behavior"),
c("bignonieae", "bignoniaceae"),
c("biological", "biology"),
c("brazilian","brazil"),
c("building","build"),
c("changing", "change"),
c("cnidarian", "cnidaria"),
c("coastal","coast"),
c("colour", "color"),
c("colors", "color"),
c("communities","community" ),
c("competitive", "competition"),
c("complexity", "complex"),
c("convergences", "convergence"),
c("convergent", "convergence"),
c("cordatus","cordata" ),
c("croplands","crop"),
c( "cultural", "culture"),
c("darwin's", "darwin"),
c("darwinian", "darwin"),
c("defensive", "defense"),
c("dependent","dependence"),
c("detecting","detection"),
c("determine", "determinant"),
c("developmental", "development"),
c("dispersers","dispersal"),
c("disturbed", "disturbance"),
c("diversification", "diversity"),
c("dragonflies", "dragonfly"),
c("drier", "drought"),
c("ecological", "ecology"),
c("ecologists", "ecology"),
c("endemic", "endemism"),
c("effectiveness", "efficiency"),
c("environmental", "environment"),
c("evolutionary", "evolution"),
c("expanding", "expansion"),
c("extinct", "extinction"),
c("facilitate", "facilitation"),
c("fisheries", "fishery"),
c("floral", "flora"),
c("floristic", "flora"),
c("forested", "forest"),
c("functional", "function"),
c("functionally", "function"),
c("functioning", "function"),
c("frequencies", "frequency"),
c("frequently", "frequency"),
c("frequent", "frequency"),
c("geographical", "geographic"),
c("heterogeneties", "heterogeneity"),
c("heterogeneous", "heterogeneity"),
c("histories", "history"),
c("integrated", "integration"),
c("intregating", "integration"),
c("integrative", "integration"),
c("invasive", "invasion"),
c("isotopic", "isotope"),
c("linking", "link"),
c("living", "live"),
c("mammalia", "mammal"),
c("managed", "manage"),
c("managers", "manage"),
c("mathematical", "mathematics"),
c("mates", "mating"),
c("mediated", "mediate"),
c("mechanistic", "mechanism"),
c("matrices", "matrix"),
c("migratory", "migration"),
c("mimicking", "mimicry"),
c("modeling", "model"),
c("mutualistic", "mutualism"),
c("natural", "nature"),
c("neotropical", "neotropic"),
c("northeastern", "northeast"),
c("occuring", "occur"),
c("onça", "onca"),
c("opiliones", "opilion"),
c("parasite", "parasitism"),
c("parent", "parenting"),
c("phylogenies", "phylogeny"),
c("phylogenetic", "phylogeny"),
c("phylogenomic", "phylogeny"),
c("pollinators", "pollination"),
c("protected", "protect"),
c("protective", "protect"),
c("rainfall", "rain"),
c("reconstructing", "reconstruction"),
c("regulatory", "regulation"),
c("regulates", "regulation"),
c("relation", "relationship"),
c("reproductive", "reproduction"),
c("restored", "restoration"),
c("robustness", "robust"),
c("scientific", "science"),
c("scientist", "science"),
c("sexy", "sexual"),
c("simulated", "simulation"),
c("societies", "society"),
c("social", "society"),
c("socio", "society"),
c("space", "spatial"),
c("spacio", "spatial"),
c("stabilize", "stability"),
c("stable", "stability"),
c("stories", "story"),
c("strategic", "strategy"),
c("strategies", "strategy"),
c("structured", "structure"),
c("structuring", "structure"),
c("studies", "study"),
c("studing", "study"),
c("sustainable", "sustainability"),
c("theories", "theory"),
c("theoretical", "theory"),
c("threatened", "threat"),
c("tropical", "tropic"),
c("vision", "visual")
)
# Rewrite every listed variant (column 1) to its canonical form (column 2),
# one pair at a time and in table order.
# stringsAsFactors = FALSE keeps both columns character on R < 4.0; with
# factor columns the assignment below could insert factor codes instead of
# the lemma text.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)
# seq_len() yields an empty sequence for an empty table, unlike 1:0
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}
WORDS - all data
table(text$gender)##
## F M
## 10558 13483
table(text$position_cat ,text$gender)##
## F M
## others 262 139
## postdoc 2792 2494
## professor 2062 5319
## student 5351 5531
Mean number of words by title+abstract
# Number of retained words (title + abstract) per presentation, by gender:
# violin + boxplot with the individual presentations overlaid as points.
text %>%
  count(id, gender) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ylab("Number of words in title + abstract") +
  # the "+" above was missing, so the points layer was printed to the console
  # instead of being added to the plot
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)
20 most common words
text %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| species | 384 |
| ecology | 185 |
| forest | 174 |
| model | 157 |
| study | 157 |
| environment | 139 |
| evolution | 134 |
| can | 129 |
| landscape | 127 |
| population | 124 |
| diversity | 112 |
| nature | 102 |
| community | 100 |
| male | 97 |
| plant | 97 |
| different | 95 |
| patterns | 88 |
| present | 86 |
| areas | 84 |
| animal | 82 |
| interaction | 82 |
Word cloud
# Word cloud of all retained words, followed by one cloud per gender
# (each gender in its own colour).
textplot_wordcloud(x = dfm(tokens(text$word)))
par(mfrow = c(1, 2))
# "color" is spelled out instead of relying on partial matching of "col"
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   color = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next plot to draw without starting a
# new frame; combined with mfrow = c(1, 2) this may overlay the two clouds
# rather than place them side by side -- confirm this is intended.
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   color = "#FCA532")
Word frequencies by gender
# Relative frequency of every word within each gender, reshaped to one row
# per word with *_F and *_M columns, plus the absolute and relative
# difference between the two proportions. Words used by only one gender have
# NA differences and are therefore sorted last by arrange(desc()).
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>% # grouping is only needed for the per-gender proportion
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))

# label only the 20 words with the largest absolute difference
props$label <- NA
props$label[1:20] <- props$word[1:20]

# Female vs male proportion of each word on log-log axes; the dashed
# identity line marks equal use by both genders.
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.2) +
  geom_text_repel(aes(label = label), size = 3.2) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0)) # geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq.jpg", height = 5, width = 7)
Words that are close to the dashed line have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.
Correlation of word frequency use between genders:
cor.test(props$proportion_F, props$proportion_M)##
## Pearson's product-moment correlation
##
## data: props$proportion_F and props$proportion_M
## t = 71.272, df = 1648, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8565945 0.8802780
## sample estimates:
## cor
## 0.8689328
Highly correlated: this means both genders tend to use the main words at similar frequencies.
20 words with the largest differences in frequency
# Diverging bar chart of the labelled words: female proportions are negated
# so they extend to the left of zero, male proportions to the right.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # columns are selected by name; the former positional 2:3 would silently
  # pick other columns if the column order of props ever changed
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # tick labels show absolute values on both sides of zero
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))
ggsave("figures/abstract_wordFreq_barplot.jpeg", units = "in", width = 7, height = 7, dpi = 300)
TF IDF
# tf-idf of each word, with each gender acting as one "document"
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
10 “exclusive” words for each group
text_id$word <- as.factor(text_id$word)
# top 10 tf-idf words per gender (ties may add rows), one facet per gender
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf_idf is mapped to x, so its title goes on the x axis; it was previously
  # set on y, which holds the words
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
WORDS - professors only data
textP <- text %>% filter(position_cat == "professor")
table(textP$gender)##
## F M
## 2062 5319
Mean number of words by abstract
textP %>% count(id,gender) %>%
ggplot(aes(x=gender, y=n)) +
geom_violin() + geom_boxplot(width=0.2)+
ggbeeswarm::geom_quasirandom(size=3, shape=21)
20 most common words
textP %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| species | 90 |
| ecology | 68 |
| environment | 52 |
| evolution | 52 |
| population | 52 |
| nature | 44 |
| plant | 43 |
| study | 42 |
| model | 41 |
| can | 39 |
| ecosystem | 38 |
| diversity | 35 |
| society | 33 |
| water | 32 |
| pollination | 30 |
| research | 30 |
| biology | 29 |
| interaction | 29 |
| science | 29 |
| present | 26 |
Words Frequency by gender
# Professors only: share of each word within each gender, one row per word,
# with the absolute and relative gap between the female and male shares.
propsP <- textP %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  ) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# the first 20 rows (largest absolute gap) carry their word as label
propsP$label <- NA
propsP$label[seq_len(20)] <- propsP$word[seq_len(20)]

# female vs male share on log-log axes, dashed identity line for reference
ggplot(
  propsP,
  aes(x = proportion_M, y = proportion_F, color = abs.dif.p)
) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size=2.5, alpha=0.3) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(
    name = "Male most used words", limits = c(0.0003, 0.02),
    labels = percent_format()
  ) +
  scale_y_log10(
    name = "Female Most used words", limits = c(0.0003, 0.02),
    labels = percent_format()
  ) +
  scale_color_gradient(
    name = "Abs Diff", low = "blue", high = "red",
    labels = percent_format()
  ) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0)) # geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq_Prof.jpg", height = 5, width = 7)
Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females.
Labels for the 20 words with largest differences in frequency.
Correlation of word frequency use between genders:
cor.test(propsP$proportion_F, propsP$proportion_M)##
## Pearson's product-moment correlation
##
## data: propsP$proportion_F and propsP$proportion_M
## t = 21.015, df = 560, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6150815 0.7078441
## sample estimates:
## cor
## 0.66401
20 words with the largest differences in frequency
# Professors only: diverging bar chart of the labelled words. Female
# proportions are negated so they extend to the left of zero.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # columns are selected by name; the former positional 2:3 would silently
  # pick other columns if the column order of propsP ever changed
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # tick labels show absolute values on both sides of zero
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))
ggsave("figures/abstract_wordFreq_barplot_Prof.jpeg", units = "in", width = 7, height = 7, dpi = 300)
TF IDF
# Professors only: tf-idf of each word, with each gender as one "document"
text_idP <- textP %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
10 “exclusive” words for each group
text_idP$word <- as.factor(text_idP$word)
# top 10 tf-idf words per gender (ties may add rows), one facet per gender
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf_idf is mapped to x, so its title goes on the x axis; it was previously
  # set on y, which holds the words
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
Topic model - all data
# Document-term matrix with one document per presentation. The gender is
# appended to the document id ("<id>_<gender>") so it can be read back from
# the document name after fitting.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>% # namespaced like the select() call used for tokenising
  cast_dtm(term = word, document = id, value = n)
Choosing number of topics: comparing AIC
# LDA fits with 2 to 5 topics, all with the same seed, compared by AIC
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
ap_lda5 <- LDA(matext, k = 5, control = list(seed = 1234))
# TRUE is spelled out because T is an ordinary, reassignable variable
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, ap_lda5, base = TRUE)
## AIC dAIC df
## ap_lda2 371013.1 0.0 9655
## ap_lda3 373602.8 2589.7 14482
## ap_lda4 377235.0 6221.9 19309
## ap_lda5 382315.5 11302.3 24136
two-topics model seems the most plausible model
Word-topic probabilities
10 words with the largest probabilities for each group
# per-topic word probabilities (beta) from the two-topic model
ap_topics <- tidy(ap_lda2, matrix = "beta")

# highest-beta terms of each topic, ordered by topic and decreasing beta
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# horizontal bars, one facet per topic
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Document-topic probabilities - classifying the abstracts
and comparing the two groups by gender (if there is a difference in frequency)
# per-document topic probabilities (gamma) from the two-topic model
ap_documents <- tidy(ap_lda2, matrix = "gamma")
# Assign each document to its most probable topic. The gender is the last
# character of the document name ("<id>_<gender>").
classifi <- ap_documents %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  # exactly one row per document: top_n(1, gamma) kept every tied row, which
  # would count a document under more than one topic
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()
table(classifi$gender, classifi$topic)##
## 1 2
## F 39 60
## M 63 74
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>% kable()| gender | 1 | 2 |
|---|---|---|
| F | 39% (39) | 61% (60) |
| M | 46% (63) | 54% (74) |
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ gender)Chi-square test
chisq.test(classifi$gender, classifi$topic)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: classifi$gender and classifi$topic
## X-squared = 0.76661, df = 1, p-value = 0.3813
Topic model - Professors only
# Professors only: document-term matrix with one document per presentation;
# the gender is appended to the document id ("<id>_<gender>").
matextP <- textP %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>% # namespaced like the select() call used for tokenising
  cast_dtm(term = word, document = id, value = n)

# LDA fits with 2 to 4 topics, all with the same seed, compared by AIC
ap_lda2P <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3P <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4P <- LDA(matextP, k = 4, control = list(seed = 1234))
# TRUE is spelled out because T is an ordinary, reassignable variable
bbmle::AICtab(ap_lda2P, ap_lda3P, ap_lda4P, base = TRUE)
## AIC dAIC df
## ap_lda2P 111913.7 0.0 5017
## ap_lda3P 113740.0 1826.3 7525
## ap_lda4P 116048.9 4135.2 10033
word-topic probabilities
# professors only: per-topic word probabilities (beta), two-topic model
ap_topicsP <- tidy(ap_lda2P, matrix = "beta")

# highest-beta terms of each topic, ordered by topic and decreasing beta
ap_top_termsP <- ap_topicsP %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# horizontal bars, one facet per topic
ap_top_termsP %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Document-topic probabilities
# professors only: per-document topic probabilities (gamma)
ap_documentsP <- tidy(ap_lda2P, matrix = "gamma")
# Assign each document to its most probable topic. The gender is the last
# character of the document name ("<id>_<gender>").
classifiP <- ap_documentsP %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  # exactly one row per document: top_n(1, gamma) kept every tied row, which
  # would count a document under more than one topic
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()
table(classifiP$gender, classifiP$topic)##
## 1 2
## F 15 6
## M 27 32
library(janitor)
classifiP %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>% kable()| gender | 1 | 2 |
|---|---|---|
| F | 71% (15) | 29% (6) |
| M | 46% (27) | 54% (32) |
# gamma of the assigned topic, by topic, one facet per gender
classifiP %>%
  # mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(as.character(topic), gamma)) +
  # violin first with a narrow boxplot on top, as in the other plots of this
  # report; in the previous order the violin was drawn over the boxplot and
  # could hide it
  geom_violin() +
  geom_boxplot(width = 0.2) +
  facet_wrap(~ gender)
Chi-square test
chisq.test(classifiP$gender, classifiP$topic)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: classifiP$gender and classifiP$topic
## X-squared = 3.1266, df = 1, p-value = 0.07702
Sentiment analysis
Chapter 2, Silge & Robinson, 2018
- The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
get_sentiments("nrc")## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
- The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
get_sentiments("bing")## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
- The AFINN lexicon assigns words a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
get_sentiments("afinn")## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
PENSAR: tem que levar em conta o número de palavras diferentes entre abstracts - principalmente se houver diferença média de número de palavras por abstract de homens e mulheres, né? Ou não?
Score words difference in female and male abstracts
All data
# AFINN scores joined to the per-abstract word counts; words that are not in
# the lexicon are dropped by the inner join
affword <- get_sentiments("afinn")
affc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, by = "word")
Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:
# plain and count-weighted mean score per abstract; the weighted one is plotted
affc %>%
  group_by(id, gender) %>%
  summarise(
    mean.score = mean(value),
    weig.score = weighted.mean(value, n)
  ) %>%
  ggplot(aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")
Professors
# professors only: AFINN scores joined to the per-abstract word counts
affword <- get_sentiments("afinn")
affcP <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, by = "word")
Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:
# plain and count-weighted mean score per abstract; the weighted one is plotted
affcP %>%
  group_by(id, gender) %>%
  summarise(
    mean.score = mean(value),
    weig.score = weighted.mean(value, n)
  ) %>%
  ggplot(aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")
Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica.
Precisa saber como ponderar pelo total de palavras.
All data
# NRC lexicon: occurrences of lexicon words per abstract, gender and
# sentiment category (raw counts, not scaled by abstract length)
nrcword <- get_sentiments("nrc")
nrc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# one facet per sentiment category, genders side by side
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  geom_quasirandom()
Professors
# Professors only; note that this overwrites the all-data `nrc` table.
nrcword <- get_sentiments("nrc")
nrc <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# one facet per sentiment category, genders side by side
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  geom_quasirandom()

# closer look at the "positive" category only
nrc %>%
  filter(sentiment == "positive") %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  geom_quasirandom() +
  ggtitle("Positive words")
Frequency of sentiment words per abstract
All data
# Bing lexicon: occurrences of lexicon words per abstract, gender and
# sentiment (raw counts, not scaled by abstract length)
bingword <- get_sentiments("bing")
bing <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# one facet per gender, sentiments side by side
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  geom_quasirandom()
Professors
# Professors only; note that this overwrites the all-data `bing` table.
bingword <- get_sentiments("bing")
bing <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# one facet per gender, sentiments side by side
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  geom_quasirandom()